# Load required libraries
library(tidyverse)
library(tidytext)
library(tm)
library(SnowballC)
library(Rtsne)
library(ggplot2)
library(plotly)
library(scales)
library(wordcloud)
library(viridis)
library(DT)

# Set random seed for reproducibility (affects the stratified sampling below)
set.seed(42)

# Read data (update path as needed)
# NOTE(review): reads directly from GitHub, so this step requires network
# access; stringsAsFactors = FALSE keeps Text/Summary as character columns.

df <- read.csv("https://raw.githubusercontent.com/JIHONGKING/Data_Analysis/refs/heads/main/filtered_amazon_reviews.csv", stringsAsFactors = FALSE)

# Clean and preprocess the raw reviews: keep non-missing, mid-length texts
# with a valid 1-5 star score, derive a three-way sentiment label from the
# score, and de-duplicate identical review texts.
df_clean <- df %>%
  filter(
    !is.na(Text),
    Score %in% 1:5,
    between(nchar(Text), 51, 999)  # drop very short and extremely long reviews
  ) %>%
  mutate(
    sentiment = case_when(
      Score <= 2 ~ "Negative",
      Score == 3 ~ "Neutral",
      Score >= 4 ~ "Positive"
    )
  ) %>%
  distinct(Text, .keep_all = TRUE)  # keep the first occurrence of each text

# Sentiment distribution (count and share) before any sampling is applied
sentiment_counts <- df_clean %>%
  count(sentiment) %>%
  mutate(percentage = 100 * n / sum(n))

# Create a balanced sample with 500 reviews from each sentiment category.
#
# sample_n_or_all: draw exactly `n` rows from `df`, sampling with
# replacement only when fewer than `n` rows are available, so every
# stratum reaches the target size even when a class is under-represented.
#
# @param df A data frame to sample rows from.
# @param n  Number of rows to return.
# @return   A data frame with exactly `n` rows.
sample_n_or_all <- function(df, n) {
  # slice_sample() supersedes sample_n(); `replace = nrow(df) < n`
  # collapses the original if/else into a single call.
  slice_sample(df, n = n, replace = nrow(df) < n)
}

# Stratified sampling for balanced sentiment representation:
# 500 reviews per class, then bound into a single sample.
df_neg <- sample_n_or_all(filter(df_clean, sentiment == "Negative"), 500)
df_neu <- sample_n_or_all(filter(df_clean, sentiment == "Neutral"), 500)
df_pos <- sample_n_or_all(filter(df_clean, sentiment == "Positive"), 500)

df_sample <- bind_rows(df_neg, df_neu, df_pos)

# Text preprocessing: tokenize each review into words, drop short tokens
# and stop words, stem the remainder, and count occurrences per review.
reviews_tidy <- df_sample %>%
  select(Id, Text, sentiment) %>%
  unnest_tokens(word, Text) %>%
  filter(nchar(word) > 2) %>%        # short tokens are rarely meaningful
  anti_join(stop_words) %>%          # drop common stop words
  mutate(word = wordStem(word)) %>%  # stem to merge related word forms
  count(Id, sentiment, word, sort = TRUE) %>%
  ungroup()

# Weight words by TF-IDF, treating each review Id as one document
tfidf <- bind_tf_idf(reviews_tidy, word, Id, n)

# Build a document-term matrix from the TF-IDF weights and
# coerce it to a dense numeric matrix for downstream t-SNE.
dtm <- cast_dtm(tfidf, document = Id, term = word, value = tf_idf)
dtm_mat <- as.matrix(dtm)

# Remove any zero-variance features: constant columns carry no information
# for the embedding. `drop = FALSE` keeps the result a matrix even if only
# a single column survives the filter (otherwise R would collapse it to a
# vector, losing rownames and breaking the Rtsne input).
var_cols <- apply(dtm_mat, 2, var)
dtm_filtered <- dtm_mat[, var_cols > 0, drop = FALSE]

# Map back sentiment information: document ids come from the DTM rownames
# (character), so Id must be character on both sides of the later join.
ids <- rownames(dtm_filtered)
sentiment_map <- df_sample %>%
  mutate(Id = as.character(Id)) %>%
  select(Id, sentiment, Text, Score)

# Project the TF-IDF document vectors onto two dimensions with t-SNE
set.seed(123)  # reproducible embedding
tsne_model <- Rtsne(
  dtm_filtered,
  dims = 2,
  perplexity = 30,  # balances local vs. global structure
  max_iter = 1000,  # extra iterations for better convergence
  verbose = TRUE
)
## Performing PCA
## Read the 1500 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.12 seconds (sparsity = 0.105289)!
## Learning embedding...
## Iteration 50: error is 71.855611 (50 iterations in 0.11 seconds)
## Iteration 100: error is 70.894975 (50 iterations in 0.10 seconds)
## Iteration 150: error is 70.872609 (50 iterations in 0.09 seconds)
## Iteration 200: error is 70.872434 (50 iterations in 0.08 seconds)
## Iteration 250: error is 70.872518 (50 iterations in 0.08 seconds)
## Iteration 300: error is 1.887525 (50 iterations in 0.07 seconds)
## Iteration 350: error is 1.662712 (50 iterations in 0.08 seconds)
## Iteration 400: error is 1.597569 (50 iterations in 0.08 seconds)
## Iteration 450: error is 1.569047 (50 iterations in 0.08 seconds)
## Iteration 500: error is 1.551669 (50 iterations in 0.08 seconds)
## Iteration 550: error is 1.540649 (50 iterations in 0.08 seconds)
## Iteration 600: error is 1.533899 (50 iterations in 0.08 seconds)
## Iteration 650: error is 1.528895 (50 iterations in 0.08 seconds)
## Iteration 700: error is 1.525544 (50 iterations in 0.08 seconds)
## Iteration 750: error is 1.521983 (50 iterations in 0.08 seconds)
## Iteration 800: error is 1.517806 (50 iterations in 0.08 seconds)
## Iteration 850: error is 1.515856 (50 iterations in 0.07 seconds)
## Iteration 900: error is 1.513360 (50 iterations in 0.07 seconds)
## Iteration 950: error is 1.513138 (50 iterations in 0.06 seconds)
## Iteration 1000: error is 1.510683 (50 iterations in 0.06 seconds)
## Fitting performed in 1.57 seconds.
# Attach review metadata to the 2-D embedding (Rtsne returns coordinates
# in $Y; as.data.frame names them V1/V2).
df_tsne <- as.data.frame(tsne_model$Y) %>%
  mutate(Id = ids) %>%
  left_join(sentiment_map, by = "Id") %>%
  mutate(short_text = str_trunc(Text, 200))  # tooltip-sized excerpt

# HTML tooltip shown on hover in the interactive plot
tooltip_text <- paste0(
  "Rating: ", df_tsne$Score,
  "<br>", "Sentiment: ", df_tsne$sentiment,
  "<br>", "Review: ", df_tsne$short_text
)

# Interactive visualization: scatter plot of the t-SNE embedding, one
# point per review, colored by sentiment, with the custom HTML tooltip.
p <- plot_ly(df_tsne,
        x = ~V1,
        y = ~V2,
        type = "scatter",
        mode = "markers",
        color = ~sentiment,
        # Fixed palette so sentiment colors are stable across runs
        colors = c("Negative" = "#E74C3C", 
                  "Neutral" = "#7F8C8D", 
                  "Positive" = "#2ECC71"),
        text = tooltip_text,
        hoverinfo = "text",  # show only the custom tooltip, not x/y values
        marker = list(
          size = 8,
          opacity = 0.7,  # semi-transparent to reveal overplotting
          line = list(width = 1, color = "#FFFFFF")
        )) %>%
  layout(
    title = list(
      text = "Interactive t-SNE Map of Amazon Food Review Sentiments",
      font = list(size = 20)
    ),
    # t-SNE axes are unitless, so grid lines and zero lines add no meaning
    xaxis = list(
      title = "t-SNE Dimension 1",
      zeroline = FALSE,
      showgrid = FALSE
    ),
    yaxis = list(
      title = "t-SNE Dimension 2",
      zeroline = FALSE,
      showgrid = FALSE
    ),
    # Legend inset in the top-left corner with a translucent background
    legend = list(
      title = list(text = "Sentiment"),
      x = 0.01,
      y = 0.99,
      bgcolor = "rgba(255, 255, 255, 0.7)"
    ),
    # Extra bottom margin leaves room for the caption annotation below
    margin = list(l = 50, r = 50, b = 100, t = 100),
    annotations = list(
      x = 0.5,
      y = -0.1,
      xref = "paper",  # position relative to the plotting area, not data
      yref = "paper",
      text = "Hover over points to see review details. Notice how sentiments cluster in the semantic space.",
      showarrow = FALSE
    )
  )

# Static visualization 1: Top words by sentiment
# Calculate word importance per sentiment.
# NOTE: reviews_tidy has one row per (Id, word), so the original
# `frequency = n()` merely counted rows and was always identical to
# total_reviews. The total occurrence count is sum(n), and the
# "appears in at least 10 reviews" filter belongs on total_reviews.
word_importance <- reviews_tidy %>%
  group_by(sentiment, word) %>%
  summarize(
    frequency = sum(n),              # total occurrences across reviews
    total_reviews = n_distinct(Id),  # number of reviews containing the word
    .groups = "drop"
  ) %>%
  # Filter words that appear in at least 10 reviews
  filter(total_reviews >= 10) %>%
  # Calculate the word's distinctiveness for each sentiment
  group_by(word) %>%
  mutate(
    total_freq = sum(frequency),
    sentiment_ratio = frequency / total_freq
  ) %>%
  ungroup() %>%
  # Keep words that are highly associated with a single sentiment
  filter(sentiment_ratio > 0.6)

# Get the 15 most frequent distinctive words per sentiment.
# slice_max() supersedes top_n(); ties are kept by default, matching the
# original behavior. reorder_within() prepares per-facet word ordering.
top_words <- word_importance %>%
  group_by(sentiment) %>%
  slice_max(frequency, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, frequency, sentiment))

# Static visualization 2: heatmap of sentiment share within each star
# rating, showing how the 1-5 scores map onto the three sentiment labels.
rating_sentiment_plot <- df_sample %>%
  count(Score, sentiment) %>%
  # Percentages are computed within each rating (rows of the heatmap
  # column sum to 100%)
  group_by(Score) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ggplot(aes(x = factor(Score), y = sentiment, fill = percentage)) +
  geom_tile() +
  # Overlay rounded percentage labels on each tile
  geom_text(aes(label = paste0(round(percentage), "%")), 
            color = "white", fontface = "bold") +
  # Reversed viridis scale: darker tiles = larger share
  scale_fill_viridis(option = "D", direction = -1) +
  labs(
    title = "Sentiment Distribution Across Rating Spectrum",
    subtitle = "Mapping between numerical ratings and sentiment categories",
    x = "Rating (1-5 stars)",
    y = NULL,
    fill = "% of Reviews"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    panel.grid = element_blank(),  # grid lines are noise behind tiles
    axis.text = element_text(size = 12),
    plot.title = element_text(face = "bold")
  )

# Display the interactive t-SNE plot (auto-printing a plotly object renders it)
p